package com.code.crawler;

import com.code.stringutil.StringUtils;
import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.NicelyResynchronizingAjaxController;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.HtmlPage;
import com.getApiData.GetUrlFile;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.chrome.ChromeOptions;

import java.io.File;
import java.util.*;
import java.util.concurrent.TimeUnit;
import java.util.stream.Collectors;

/**
 * Scrapes the Douyin web site and downloads the videos it finds.
 *
 * <p>Two entry points are offered: {@link #getUserIssue(String, String)} walks a
 * user's profile page, and {@link #search(String, int, String)} walks a search
 * result page. Both collect links to video detail pages and delegate the actual
 * download to {@link #getVideoInfo(List, String, WebDriver)}.
 *
 * @author 祁雪
 */
public class CrawlerDouYin {

    /** Prefix shared by every video detail page link this crawler follows. */
    private static final String VIDEO_URL_PREFIX = "https://www.douyin.com/video/";

    /** System property Selenium reads to locate the chromedriver binary. */
    private static final String CHROME_DRIVER_PROPERTY = "webdriver.chrome.driver";

    /** Chromedriver location used when {@link #CHROME_DRIVER_PROPERTY} is not already set. */
    private static final String DEFAULT_CHROME_DRIVER_PATH =
            "C:\\Program Files\\Google\\Chrome\\Application\\chromedriver.exe";

    /** Search page URL template; placeholders are filled by {@code StringUtils.format}. */
    private static final String SEARCH_URL_TEMPLATE =
            "https://www.douyin.com/search/{0}?publish_time=0&sort_type={1}&source=search_history&type=video&aid={2}";

    /** Fixed pause (milliseconds) after each navigation so the page can finish rendering. */
    private static final long PAGE_SETTLE_MILLIS = 10000L;

    /**
     * Characters that must not appear in a file name: the Windows-reserved set
     * ({@code \ / : * ? " < > |}) plus line breaks and tabs.
     */
    private static final String ILLEGAL_FILE_NAME_CHARS = "[\\\\/:*?\"<>|\\r\\n\\t]";

    /**
     * Crawls a user's profile page and downloads every video linked from it.
     *
     * <p>Videos are saved under {@code savePath/<page title>}, where the title is the
     * text of the first {@code <h1>} on the page. If the page has no usable title the
     * videos are saved directly under {@code savePath}. Failures are printed to
     * standard error and not rethrown.
     *
     * @param url      address of the user's profile page
     * @param savePath root directory for downloaded videos
     * @throws Exception kept for interface compatibility; failures are reported, not propagated
     */
    public static void getUserIssue(String url,
                                    String savePath) throws Exception {

        // try-with-resources guarantees the HtmlUnit client is released on every exit path.
        try (WebClient webClient = new WebClient(BrowserVersion.CHROME)) {
            // Do not throw when a page script fails.
            webClient.getOptions().setThrowExceptionOnScriptError(false);
            // Do not throw when the HTTP status is not 200.
            webClient.getOptions().setThrowExceptionOnFailingStatusCode(false);
            webClient.getOptions().setActiveXNative(false);
            // CSS is not needed for link extraction, so it is disabled.
            webClient.getOptions().setCssEnabled(false);
            // JavaScript must run for the page to populate itself.
            webClient.getOptions().setJavaScriptEnabled(true);
            // Re-synchronise AJAX calls so their results are in the DOM when it is read.
            webClient.setAjaxController(new NicelyResynchronizingAjaxController());

            System.out.println("进入用户页: " + url);
            HtmlPage page = webClient.getPage(url);

            System.out.println("检索视频详情页链接");
            Document document = Jsoup.parse(page.asXml());

            // The heading may be absent (e.g. a verification page); fall back to savePath.
            Element heading = document.selectFirst("h1");
            String title = heading == null ? "" : sanitizeFileName(heading.text());
            String path = title.trim().isEmpty() ? savePath : savePath + File.separator + title;

            getVideoInfo(collectVideoLinks(document), path, null);
        } catch (InterruptedException e) {
            // Restore the flag so callers further up can observe the interruption.
            Thread.currentThread().interrupt();
            e.printStackTrace();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Visits each video detail page and downloads the video it contains.
     *
     * <p>The file name is derived from the page's first {@code <h1>}; when that is
     * missing, the last path segment of the URL is used instead. A video whose target
     * file already exists is skipped. The driver, whether supplied or created here,
     * is always shut down before this method returns, including on failure.
     *
     * @param urls      video detail page addresses; {@code null} is treated as empty
     * @param savePath  directory the videos are written to
     * @param webDriver browser to reuse, or {@code null} to launch a new Chrome instance
     * @throws Exception if navigation, the wait, or a download fails
     */
    public static void getVideoInfo(List<String> urls,
                                    String savePath, WebDriver webDriver) throws Exception {
        WebDriver driver = webDriver != null ? webDriver : createChromeDriver();
        List<String> targets = urls != null ? urls : Collections.<String>emptyList();

        try {
            for (String url : targets) {
                System.out.println("进入详情: " + url);
                driver.get(url);

                // Give the player time to inject the <video> element.
                Thread.sleep(PAGE_SETTLE_MILLIS);

                Document document = Jsoup.parse(driver.getPageSource());
                Element video = document.selectFirst("video");
                if (video == null) {
                    continue;
                }

                String src = resolveVideoSource(video);
                if (src == null) {
                    System.out.println("未找到视频地址, 跳过: " + url);
                    continue;
                }
                System.out.println("下载视频: " + src);

                String fileName = buildFileName(document.selectFirst("h1"), url);
                System.out.println("文件名称: " + fileName);

                File file = new File(savePath, fileName);
                if (file.exists()) {
                    System.out.println(fileName + " => 已下载");
                    continue;
                }
                GetUrlFile.downLoadHttpUrl(src, savePath, fileName);
            }
        } finally {
            // quit() ends the whole session; close() would only close the current window.
            driver.quit();
        }
    }

    /**
     * Searches Douyin for a keyword and downloads every video on the result page.
     *
     * <p>The browser is opened visibly and the method pauses so a human can pass the
     * site's verification step, then reloads the page and scans it. Videos are saved
     * under {@code savePath/search}. Failures are printed to standard error and not
     * rethrown.
     *
     * @param key      search keyword
     * @param sort     ordering: 0 = relevance, 1 = most liked, 2 = newest
     * @param savePath root directory for downloaded videos
     */
    public static void search(String key,
                              int sort,
                              String savePath) {

        WebDriver webDriver = createChromeDriver();
        // Once the driver is passed to getVideoInfo, that method owns its shutdown.
        boolean handedOff = false;

        try {
            String url = StringUtils.format(SEARCH_URL_TEMPLATE, key, String.valueOf(sort),
                    UUID.randomUUID().toString());

            System.out.println("访问: " + url);
            webDriver.get(url);

            System.out.println("等待手动通过验证");
            Thread.sleep(PAGE_SETTLE_MILLIS);

            System.out.println("刷新页面");
            webDriver.get(url);
            Thread.sleep(PAGE_SETTLE_MILLIS);

            System.out.println("检索dom");
            Document document = Jsoup.parse(webDriver.getPageSource());

            List<String> urls = collectVideoLinks(document);

            System.out.println("已筛选出" + urls.size() + "条视频");
            String path = savePath + File.separator + "search";

            handedOff = true;
            getVideoInfo(urls, path, webDriver);

        } catch (InterruptedException e) {
            // Restore the flag so callers further up can observe the interruption.
            Thread.currentThread().interrupt();
            e.printStackTrace();
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            if (!handedOff) {
                webDriver.quit();
            }
        }
    }

    /**
     * Launches Chrome with a 10-second implicit wait.
     *
     * <p>An already configured {@code webdriver.chrome.driver} system property is
     * respected; the built-in default path is applied only when none is set.
     * Headless mode is left off because the search flow waits for manual verification.
     */
    private static WebDriver createChromeDriver() {
        if (System.getProperty(CHROME_DRIVER_PROPERTY) == null) {
            System.setProperty(CHROME_DRIVER_PROPERTY, DEFAULT_CHROME_DRIVER_PATH);
        }

        WebDriver driver = new ChromeDriver(new ChromeOptions());
        driver.manage().timeouts().implicitlyWait(10, TimeUnit.SECONDS);
        return driver;
    }

    /** Returns the distinct video detail links found in the document, in page order. */
    private static List<String> collectVideoLinks(Document document) {
        Set<String> links = new LinkedHashSet<>();
        for (Element anchor : document.select("a")) {
            if (!anchor.hasAttr("href")) {
                continue;
            }
            String href = anchor.attr("href");
            if (href.startsWith(VIDEO_URL_PREFIX)) {
                links.add(href);
            }
        }
        return new ArrayList<>(links);
    }

    /**
     * Returns the download URL for a {@code <video>} element, or {@code null} if it has none.
     * Looks at the element's own {@code src} first, then at its first {@code <source>} child.
     */
    private static String resolveVideoSource(Element video) {
        String raw = video.attr("src");
        if (raw.isEmpty()) {
            Element source = video.selectFirst("source[src]");
            if (source != null) {
                raw = source.attr("src");
            }
        }
        if (raw.isEmpty()) {
            return null;
        }
        // Scheme-less values get "https:" prepended; absolute ones are used untouched.
        return raw.startsWith("http") ? raw : "https:" + raw;
    }

    /** Builds the {@code .mp4} file name from the page heading, falling back to the URL's last segment. */
    private static String buildFileName(Element heading, String url) {
        String base = heading == null ? "" : sanitizeFileName(heading.text());
        if (base.trim().isEmpty()) {
            String withoutQuery = url.split("[?#]", 2)[0];
            base = sanitizeFileName(withoutQuery.substring(withoutQuery.lastIndexOf('/') + 1));
        }
        if (base.trim().isEmpty()) {
            base = "video";
        }
        return base + ".mp4";
    }

    /** Removes every character that is not allowed in a file name. */
    private static String sanitizeFileName(String name) {
        return name.replaceAll(ILLEGAL_FILE_NAME_CHARS, "");
    }

    /**
     * Manual entry point: runs a sample keyword search and stores the results locally.
     *
     * @param args unused
     * @throws Exception kept for interface compatibility
     */
    public static void main(String[] args) throws Exception {

        // Sample profile page; pass it to getUserIssue(url, videoSavePath) to crawl a user instead.
        String url = "https://www.douyin.com/user/MS4wLjABAAAAQq8_8RtbEcjUAdeLAKETmbrvt6jdPfRNoI60SkH6J1I?enter_method=search_result&extra_params=%7B%22search_id%22%3A%22202109241115100101310570861F1E9FD3%22%2C%22search_result_id%22%3A%2216622344680%22%2C%22search_keyword%22%3A%22%E7%BE%8E%E5%A5%B3%22%2C%22search_type%22%3A%22video%22%7D&enter_from=search_result";

        String videoSavePath = "D:\\爬虫文件\\抖音";

        search("美女禁欲系御姐跳舞", 1, videoSavePath);
    }
}
